
/****************************************************************************************************                                                                                                   
 * This .do file performs the following tasks:
 * 
 * 1. Imports raw baseline data
 * 2. Labels variables and choices for survey versions
 * 3. Cleans data:
 *    - Drops unnecessary variables
 *    - Recodes variables and prepares them for analysis
 *    - Saves required variables with base IDs to be merged with endline data
 *    - Merges baseline cleaned data with randomized teams and saves data for balance check tables
 ****************************************************************************************************/

*-----------------------------------
 * Setup Environment
 *-----------------------------------
* Start from an empty session, then apply the session settings.
* segmentsize is changed only after memory has been cleared.
clear all
set more off          /* do not pause long output */
set segmentsize 3g    /* memory setting */


*-----------------------------------
 * Importing Raw Baseline Data
 *-----------------------------------


* Load the raw baseline survey file
use "$Data/Original/baseline_raw.dta", clear

* Remove the name fields (identifying information) before any other step
drop Enumerator_Name Employee_Name 

* Keep only the identifiers and the fields used by the cleaning steps below
keep Years_Schooling SkillGrade EmploymentType Contractor_name ProductionLine Religion Caste political_support_2019 ///
     political_support_2014 NRC_support attitude_neighbour trust_scale donate_choice attitude_marriage_religion ///
     attitude_communication_religion attitude_orders_religion YearJoined EnterEmployeeDateofBirthD ///
     interaction_cross_rel hm_ratio_wf hm_ratio_nf _id deviceid IAT_score Type_Schooling ///
     Residence_type_family residence_size attitude_lend_religion

* Put the identifiers first
order _id deviceid

* Convert _id to numeric if necessary, and only then sort.
* Sorting a string _id first would order the rows lexicographically
* (for example "10" before "2") instead of numerically.
destring _id, replace
sort _id


/*************************************
 *          Data Cleaning
 *************************************/

*-----------------------------
 * School Years
 *-----------------------------
* Numeric copy of the schooling answer; a missing value is set to 0 years
* (missing is taken to mean the respondent did not go to school).
destring Years_Schooling, generate(school_years)
recode school_years (. = 0)

*-----------------------------
 * Encode Skill-Grade
 *-----------------------------
encode SkillGrade, generate(skill)

*-----------------------------
 * String Clean-up Before Encoding
 *-----------------------------
* Strip leading and trailing blanks from the text fields that are encoded below
foreach strvar of varlist EmploymentType Religion {
    replace `strvar' = trim(`strvar')
}

* Harmonise spelling variants of contractor names
replace Contractor_name = "JJE" if Contractor_name == "JJJE"
foreach variant in MKe Mke MkE {
    replace Contractor_name = "MKE" if Contractor_name == "`variant'"
}

*-----------------------------
 * Encode Employment Type, Religion and Caste
 *-----------------------------
encode EmploymentType, generate(employment)
encode Religion,       generate(religion)
encode Caste,          generate(caste)

*-----------------------------
 * Political Support
 *-----------------------------
* For each election year, build an indicator that is 1 when the answer is "bjp",
* missing when the answer is blank or one of the non-voting and non-response
* answers listed below, and 0 for every other answer.
*
* The same list is applied to both years, so that an answer such as "not voted"
* or "n/a" cannot be coded 0 in one year and missing in the other. Both
* spellings "ineligible" and "uneligible" are accepted.
*
* inlist() accepts at most 10 string arguments, hence two replace statements.
foreach yy in 19 14 {
    local raw political_support_20`yy'

    * Normalise case and surrounding blanks before matching
    replace `raw' = lower(trim(`raw'))

    generate pol_support`yy' = (`raw' == "bjp")

    * Did not vote or could not vote
    replace pol_support`yy' = . if inlist(`raw', "", "did not vote", "didnt vote", "not voted", ///
        "ineligible to vote", "uneligible to vote", "age criteria not met", "out of state")

    * Not applicable, does not remember, or refused
    replace pol_support`yy' = . if inlist(`raw', "na", "n/a", "not applicable", "dont remember", ///
        "refuse to answer", "refused to answer")
}

*-----------------------------
 * Support for NRC
 *-----------------------------
* 1 = "Yes"; missing for blank, "Dont Know" (with or without a trailing blank)
* and "Refuse to Answer"; 0 for any other answer.
generate nrc_support = cond(inlist(NRC_support, "", "Dont Know", "Dont Know ", "Refuse to Answer"), ///
                            ., NRC_support == "Yes")

*-----------------------------
 * Cross-Religion Interaction
 *-----------------------------
* 1 = "More than 5", 0.5 = "Less than 5", 0 = any other non-blank answer.
generate int_cross = (interaction_cross_rel == "More than 5")
replace int_cross = 0.5 if interaction_cross_rel == "Less than 5"

* A blank answer is a non-response, so leave it missing rather than coding it
* as 0. This matches how blanks are treated for the other attitude measures
* in this file.
replace int_cross = . if interaction_cross_rel == ""

*-----------------------------
 * Attitude Towards Neighbors
 *-----------------------------
* 1 = "No", 0.5 = "Would not prefer, but would be ok", missing = blank,
* 0 = any other answer.
generate neighbour_attitude = cond(attitude_neighbour == "", ., ///
    cond(attitude_neighbour == "Would not prefer, but would be ok", 0.5, attitude_neighbour == "No"))

*-----------------------------
 * Generalized Trust and Altruism (WVS)
 *-----------------------------
* Numeric copies of the trust scale and of the donation amount
destring trust_scale donate_choice, generate(trust donate)

* Log of one plus the donation amount, so that a donation of 0 stays defined
generate ln_donate = ln(1 + donate)

*-----------------------------
 * Attitudes Towards Inter-Religious Marriage
 *-----------------------------
* 1 = "No", 0.5 = "Would not prefer, but would be ok",
* missing = blank or "Dont Know", 0 = any other answer.
generate marriage_religion = cond(inlist(attitude_marriage_religion, "", "Dont Know"), ., ///
    cond(attitude_marriage_religion == "Would not prefer, but would be ok", 0.5, attitude_marriage_religion == "No"))

*-----------------------------
 * More Comfortable Communicating with Coreligionists
 *-----------------------------
* First pass: 1 = "Always", 0.5 = "Sometimes", missing = blank or "Dont Know",
* 0 = any other answer.
generate comm_rel = cond(inlist(attitude_communication_religion, "", "Dont Know"), ., ///
    cond(attitude_communication_religion == "Sometimes", 0.5, attitude_communication_religion == "Always"))

* Reverse the scale for consistent coefficient signs:
* 1 becomes 0, 0 becomes 1, while 0.5 and missing are unchanged.
replace comm_rel = 1 - comm_rel

*-----------------------------
 * Comfortable Taking Orders from Non-Coreligionists
 *-----------------------------
* 1 = "Always"; missing = blank, "Refuse to Answer" or "Dont Know";
* 0 = any other answer (sometimes or never comfortable).
generate orders_religion = cond(inlist(attitude_orders_religion, "", "Refuse to Answer", "Dont Know"), ///
                                ., attitude_orders_religion == "Always")

*-----------------------------
 * Tenure and Age
 *-----------------------------
* Reference year from which tenure and age are counted
local ref_year 2019

* Tenure: reference year minus the year of joining. Anything below one year
* (for example, joined in the reference year) is set to 0.5.
* A missing joining year leaves tenure missing.
generate DOJ = date(YearJoined, "YMD")
generate yoj = year(DOJ)
format %td DOJ
generate tenure = cond(`ref_year' - yoj < 1, 0.5, `ref_year' - yoj)

* Age: reference year minus the year of birth
generate DOB = date(EnterEmployeeDateofBirthD, "YMD")
generate yob = year(DOB)
format %td DOB
generate age = `ref_year' - yob



*-----------------------------
 * IAT Score
 *-----------------------------
* Numeric IAT score, split by the encoded religion variable.
* NOTE(review): codes 1 and 2 come from the encode of Religion earlier in this
* file; presumably 1 = Hindu and 2 = Muslim. Confirm against the value label.
destring IAT_score, generate(iat)
generate hindu_iat  = iat if religion == 1
generate muslim_iat = iat if religion == 2

* Dummy Variable for Strong IAT Bias (Not used in the paper)
* 1 when hindu_iat is above 0.65 or muslim_iat is below -0.65,
* missing when there is no IAT score, 0 otherwise.
generate iat_b = cond(missing(iat), ., ///
    (hindu_iat > 0.65 & !missing(hindu_iat)) | (muslim_iat < -0.65 & !missing(muslim_iat)))

*-----------------------------
 * Labeling Variables
 *-----------------------------

* Background characteristics
label var school_years           "Till which grade did you study in school?"
label var Type_Schooling         "Type of school attended"
label var tenure                 "Years worked at the factory"
label var age                    "Age in Years"
label var Residence_type_family  "Do you currently live with your family?"
label var residence_size         "Total number of people living in your house"

* Trust and altruism
label var trust                  "WVS measure of trust"
label var ln_donate              "WVS measure of altruism"
label var donate_choice          "Amount you would donate from Rs 2400"

* Attitudes towards the other religion
label var int_cross              "Number of people you interact with daily outside your religion (Hindu/Muslim)"
label var neighbour_attitude     "Would you live next to a non-coreligionist? (Hindu/Muslim)"
label var marriage_religion      "Would you be okay if your children married outside your religion? (Hindu/Muslim)"
label var comm_rel               "Do you find it easier to communicate with co-religionists?"
label var orders_religion        "Are you comfortable listening to orders from a non-coreligionist?"
label var attitude_lend_religion "Would you lend money to a person from another religion? (Hindu/Muslim)"

* Politics
label var pol_support14          "Political support in the 2014 Indian General Elections"
label var pol_support19          "Political support in the 2019 Indian General Elections"
label var nrc_support            "Do you support the proposed National Register of Citizens in West Bengal?"

* Implicit association test
label var iat                    "IAT Score"
label var iat_b                  "Strong IAT Bias"


*-------------------------------------
* Prepare Final Cleaned Baseline Data
*-------------------------------------

* These two measures are used as baseline controls in endline regressions,
* so their names get a "b" suffix.
rename (comm_rel orders_religion) (comm_relb orders_relb)

* Variables carried forward, grouped by kind
local survey_raw _id Religion Caste residence_size donate_choice attitude_lend_religion IAT_score NRC_support
local background religion school_years employment skill tenure age
local measures   trust ln_donate pol_support19 pol_support14 nrc_support int_cross ///
                 neighbour_attitude marriage_religion comm_relb orders_relb iat_b

keep `survey_raw' `background' `measures'

save "$Data/Final/Baseline_Cleaned.dta", replace

/****************************************
 *  Merging with Randomized Teams
 ****************************************/

clear all

* Start from the randomized team assignment
use "$Data/Original/Randomized_Teams.dta", clear

* Attach the cleaned baseline data by _id, keeping only the rows found in both
* files; no _merge variable is created.
merge 1:1 _id using "$Data/Final/Baseline_Cleaned.dta", keep(match) nogenerate

* Numeric version of the production line
encode Line_R, generate(line_r)

* Numeric identifiers for each combination of the listed variables
egen line_sec      = group(Line_R new_section)
egen line_sec_team = group(Line_R team new_section)
egen team_treat    = group(Line_R team direct mixed)

*-----------------------------
 * Past Contact at Work
 *-----------------------------
* Religion indicators built from the encoded religion variable
generate muslim = (religion == 2)
generate hindu  = (religion == 1)

* Head counts per line-shift and per line-shift-section
egen count_total_line   = count(_id), by(Line_R shift)
egen count_total_line_s = count(_id), by(Line_R shift section)

* Number of Muslims and of Hindus in the same two kinds of cell
foreach rel in muslim hindu {
    egen count_`rel'_line   = total(`rel'), by(Line_R shift)
    egen count_`rel'_line_s = total(`rel'), by(Line_R shift section)
}

*-----------------------------
 * Exposure from Other Sections
 *-----------------------------
* Workers in the same line-shift who are outside the worker's own section
local others_n "(count_total_line - count_total_line_s)"

* Share of Muslims and share of Hindus among those workers
generate share_m_wo = (count_muslim_line - count_muslim_line_s) / `others_n'
generate share_h_wo = (count_hindu_line  - count_hindu_line_s)  / `others_n'

* Exposure to the other religion: Muslim share for religion code 1,
* Hindu share for religion code 2
generate mean_contact_hindu_l  = share_m_wo if religion == 1
generate mean_contact_muslim_l = share_h_wo if religion == 2

*-----------------------------
 * Contact in Own Section
 *-----------------------------
* Mean of the muslim indicator within the line-shift-section cell; for religion
* code 2 it is replaced by the mean of the hindu indicator in the same cell.
egen mean_contact_ls = mean(muslim), by(Line_R shift section)
egen mc              = mean(hindu),  by(Line_R shift section)
replace mean_contact_ls = mc if religion == 2

* Religion-specific copies (missing for everyone else)
generate mean_contact_hindu_ls  = mean_contact_ls if religion == 1
generate mean_contact_muslim_ls = mean_contact_ls if religion == 2

*-----------------------------
 * Share of Coreligionists
 *-----------------------------
* One minus the exposure measure: own section (_ls) first, then the rest of
* the line (_l)
foreach lvl in ls l {
    foreach rel in hindu muslim {
        generate mean_corel_`rel'_`lvl' = 1 - mean_contact_`rel'_`lvl'
    }
}

*-----------------------------
 * Share of Coreligionists in Line Section (Different Shifts)
 *-----------------------------
* Counts per line-section, pooled over shifts
egen count_ls        = count(_id),    by(Line_R section)
egen count_hindu_ls  = total(hindu),  by(Line_R section)
egen count_muslim_ls = total(muslim), by(Line_R section)

* Workers in the same line-section who are outside the worker's own shift cell
local other_shift_n "(count_ls - count_total_line_s)"

generate mean_corel_hindus_ols = (count_hindu_ls  - count_hindu_line_s)  / `other_shift_n' if religion == 1
generate mean_corel_muslim_ols = (count_muslim_ls - count_muslim_line_s) / `other_shift_n' if religion == 2

* Drop Intermediate Variables
drop muslim hindu mc ///
     count_total_line count_total_line_s ///
     count_muslim_line count_muslim_line_s ///
     count_hindu_line count_hindu_line_s ///
     count_ls count_hindu_ls count_muslim_ls

*-----------------------------
 * Labeling Contact Variables
 *-----------------------------
* Exposure to the other religion
label variable mean_contact_hindu_l      "Average baseline exposure (Hindus to Muslims) from other sections"
label variable mean_contact_muslim_l     "Average baseline exposure (Muslims to Hindus) from other sections"
label variable mean_contact_ls           "Average baseline exposure in own section"
label variable mean_contact_hindu_ls     "Average baseline exposure (Hindus to Muslims) in own section"
label variable mean_contact_muslim_ls    "Average baseline exposure (Muslims to Hindus) in own section"

* Share of co-religionists
label variable mean_corel_hindu_ls       "Share of co-religionists in own section: Hindus"
label variable mean_corel_muslim_ls      "Share of co-religionists in own section: Muslims"
label variable mean_corel_hindu_l        "Share of co-religionists in line (excluding own section): Hindus"
label variable mean_corel_muslim_l       "Share of co-religionists in line (excluding own section): Muslims"

* These four variables are also saved in the balance check data, so they are
* labelled as well. Each label restates the formula used to build the variable.
label variable share_m_wo                "Share of Muslims in own line and shift, excluding own section"
label variable share_h_wo                "Share of Hindus in own line and shift, excluding own section"
label variable mean_corel_hindus_ols     "Share of co-religionists in own section on other shifts: Hindus"
label variable mean_corel_muslim_ols     "Share of co-religionists in own section on other shifts: Muslims"

*-----------------------------
 * Line-Level Treatment Indicator
 *-----------------------------
 
/* 

This part is commented out because it contained identifying information: production lines are named after product brands. Line names have been replaced by numbers here. During randomization the letters A, B and C were used to refer to cohorts within a line, but this confused supervisors because they use A, B and C to refer to shifts, so the cohort letters were later changed to X, Y and Z.

generate HD_mixed = 0
replace HD_mixed  = 1 if inlist(Line, "Line 1") & inlist(team, "A", "B")
replace HD_mixed  = 1 if inlist(Line, "Line 3") & inlist(team, "A")
replace HD_mixed  = 1 if inlist(Line, "Line 2") & inlist(team, "A")
replace HD_mixed  = 1 if inlist(Line, "Line 4") & inlist(team, "A")
replace HD_mixed  = 1 if inlist(Line, "Line 4") & inlist(team, "A", "B")
replace HD_mixed  = . if inlist(Line, "MAIDA", "EGG")  /* Exclude lines without admin output measures */

*/
*-----------------------------
 * Save Balance Check Data
 *-----------------------------
* Write the merged data (randomized teams, cleaned baseline variables and the
* contact measures built above) to the Final folder, overwriting any existing file.
save "$Data/Final/Balance_check.dta", replace



